# Purpose: Calculate FREX statistic
#
# Usage: Rscript <this script> <number of topics>

# Reference: Roberts ME, Stewart BM, Airoldi E (2016b). "A model of
# text for experimentation in the social sciences." Journal of the
# American Statistical Association, 111(515), 988–1003.

# NOTE: the workspace is intentionally not cleared here. Every variable the
# script relies on is assigned below, and rm(list = ls()) would wipe the
# caller's session if this file were ever source()d interactively.

args <- commandArgs(trailingOnly=TRUE)

### PARAMETERS ########################
# Number of topics in the model, taken from the first command-line argument.
# It selects the input file and names the output file, so fail early with a
# clear message instead of letting an NA propagate into the file names.
if (length(args) < 1) {
    stop("Missing argument: number of topics in the model", call.=FALSE)
}
topic.model <- suppressWarnings(as.numeric(args[1]))
if (!is.finite(topic.model) || topic.model < 1 || topic.model %% 1 != 0) {
    stop("Number of topics must be a positive whole number, got: ", args[1],
         call.=FALSE)
}
# Weight of the exclusivity term in the FREX formula; the frequency term
# receives (1 - weight).
weight <- 0.5
#######################################

### PATHS ##############################
word.id.file <- "./data/word_ids.dat"
input.file.path <- "./data"
output.file.path <- "./data/FREX_values"
#######################################

# Only the number of words is needed from the word-id file, so the table
# itself is dropped again right after counting its rows.
words <- read.table(word.id.file, stringsAsFactors=FALSE)
no.words <- nrow(words)
rm(words)

# Output for current model
cat(paste("Calculating FREX for",no.words,"words in model",topic.model,"\n"))

# Open file with topic-word probabilities: no header line, the first column
# is used as row names and the remaining columns are read as the topics.
fname <- file.path(input.file.path, paste0("avg_probabilities_topic_",topic.model,".csv"))
probs <- read.table(fname, header=FALSE, row.names=1, stringsAsFactors=FALSE)

# Verify the table has the expected shape before using it. Without these
# checks a file with extra columns would be normalized over columns that are
# never processed, and a row count that happens to divide no.words would be
# silently recycled when filling the ECDF matrices.
if (ncol(probs) != topic.model) {
    stop("Expected ", topic.model, " topic columns in ", fname,
         " but found ", ncol(probs), call.=FALSE)
}
if (nrow(probs) != no.words) {
    stop("Expected ", no.words, " words in ", fname,
         " but found ", nrow(probs), call.=FALSE)
}

names(probs) <- paste0("topic",seq_len(topic.model))

# calculate exclusivity matrix: each word's probability in a topic divided by
# the sum of that word's probabilities over all topics. total.word.probs has
# one entry per row, so the division recycles it down every column and row r
# is divided by total.word.probs[r].
total.word.probs <- rowSums(probs)
normalized.probs <- probs/total.word.probs

# Evaluate, for every column (topic) of `values`, the empirical CDF of that
# column at each of the column's own entries.
#   values : table with one row per word and one column per topic
#   n.rows : number of rows of the result (number of words)
#   n.cols : number of columns to process (number of topics)
#   label  : text inserted into the progress message
# Returns an n.rows x n.cols numeric matrix of ECDF values.
ecdf.by.topic <- function(values, n.rows, n.cols, label) {
    ecdf.matrix <- matrix(NA_real_, nrow=n.rows, ncol=n.cols)
    for (i in seq_len(n.cols)) {
        cat(paste("... calculating ECDF for",label,"for topic",i,"\n"))
        topic.values <- values[,i]
        # Build the ECDF once per topic and evaluate it on the whole column;
        # rebuilding it for every single word gives the same numbers but
        # re-sorts the column each time.
        ecdf.matrix[,i] <- ecdf(topic.values)(topic.values)
    }
    ecdf.matrix
}

# calculate ECDF for each word probability
ecdf.prob.matrix <- ecdf.by.topic(probs, no.words, topic.model, "word probs")

# calculate ECDF for each normalized word probability
ecdf.normalized.prob.matrix <- ecdf.by.topic(normalized.probs, no.words, topic.model,
                                             "normalized word probs")

# calculate FREX matrix: weighted harmonic mean of the exclusivity ECDF
# (weight) and the frequency ECDF (1 - weight), element by element.
frex <- 1/( (weight/ecdf.normalized.prob.matrix) + ((1-weight)/ecdf.prob.matrix) )


# write results to output file
# Create the output directory if it is missing so that the finished
# computation is not lost to a failed write.
if (!dir.exists(output.file.path)) {
    dir.create(output.file.path, recursive=TRUE)
}
# One line per word and one value per topic, without row or column labels.
fname <- file.path(output.file.path, paste0("FREX_values_topic_model_",topic.model,".csv"))
write.table(frex, file=fname, row.names=FALSE, col.names=FALSE)

